In [3]:
pip install pandas numpy scikit-learn matplotlib seaborn
Requirement already satisfied: pandas in c:\users\lenovo\anaconda3\lib\site-packages (2.2.2) Requirement already satisfied: numpy in c:\users\lenovo\anaconda3\lib\site-packages (1.26.4) Requirement already satisfied: scikit-learn in c:\users\lenovo\anaconda3\lib\site-packages (1.5.1) Requirement already satisfied: matplotlib in c:\users\lenovo\anaconda3\lib\site-packages (3.9.2) Requirement already satisfied: seaborn in c:\users\lenovo\anaconda3\lib\site-packages (0.13.2) Requirement already satisfied: python-dateutil>=2.8.2 in c:\users\lenovo\anaconda3\lib\site-packages (from pandas) (2.9.0.post0) Requirement already satisfied: pytz>=2020.1 in c:\users\lenovo\anaconda3\lib\site-packages (from pandas) (2024.1) Requirement already satisfied: tzdata>=2022.7 in c:\users\lenovo\anaconda3\lib\site-packages (from pandas) (2023.3) Requirement already satisfied: scipy>=1.6.0 in c:\users\lenovo\anaconda3\lib\site-packages (from scikit-learn) (1.13.1) Requirement already satisfied: joblib>=1.2.0 in c:\users\lenovo\anaconda3\lib\site-packages (from scikit-learn) (1.4.2) Requirement already satisfied: threadpoolctl>=3.1.0 in c:\users\lenovo\anaconda3\lib\site-packages (from scikit-learn) (3.5.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (4.51.0) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (24.1) Requirement already satisfied: pillow>=8 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (10.4.0) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib) (3.1.2) Requirement already satisfied: six>=1.5 in c:\users\lenovo\anaconda3\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.16.0) Note: you may need to restart the kernel to use updated packages.
In [7]:
import pandas as pd
# Load your dataset (replace the path with the correct dataset file path)
df = pd.read_csv("car_price_prediction_.csv")
# Preview the dataset
print(df.head())
Car ID Brand Year Engine Size Fuel Type Transmission Mileage Condition \
0 1 Tesla 2016 2.3 Petrol Manual 114832 New
1 2 BMW 2018 4.4 Electric Manual 143190 Used
2 3 Audi 2013 4.5 Electric Manual 181601 New
3 4 Tesla 2011 4.1 Diesel Automatic 68682 New
4 5 Ford 2009 2.6 Diesel Manual 223009 Like New
Price Model
0 26613.92 Model X
1 14679.61 5 Series
2 44402.61 A4
3 86374.33 Model Y
4 73577.10 Mustang
In [9]:
print(df.isnull().sum())
Car ID 0 Brand 0 Year 0 Engine Size 0 Fuel Type 0 Transmission 0 Mileage 0 Condition 0 Price 0 Model 0 dtype: int64
In [19]:
df.head()
Out[19]:
| Car ID | Brand | Year | Engine Size | Fuel Type | Transmission | Mileage | Condition | Price | Model | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Tesla | 2016 | 2.3 | Petrol | Manual | 114832 | New | 26613.92 | Model X |
| 1 | 2 | BMW | 2018 | 4.4 | Electric | Manual | 143190 | Used | 14679.61 | 5 Series |
| 2 | 3 | Audi | 2013 | 4.5 | Electric | Manual | 181601 | New | 44402.61 | A4 |
| 3 | 4 | Tesla | 2011 | 4.1 | Diesel | Automatic | 68682 | New | 86374.33 | Model Y |
| 4 | 5 | Ford | 2009 | 2.6 | Diesel | Manual | 223009 | Like New | 73577.10 | Mustang |
In [21]:
df.shape
Out[21]:
(2500, 10)
In [23]:
df.isnull().sum()
Out[23]:
Car ID 0 Brand 0 Year 0 Engine Size 0 Fuel Type 0 Transmission 0 Mileage 0 Condition 0 Price 0 Model 0 dtype: int64
In [25]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2500 entries, 0 to 2499 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Car ID 2500 non-null int64 1 Brand 2500 non-null object 2 Year 2500 non-null int64 3 Engine Size 2500 non-null float64 4 Fuel Type 2500 non-null object 5 Transmission 2500 non-null object 6 Mileage 2500 non-null int64 7 Condition 2500 non-null object 8 Price 2500 non-null float64 9 Model 2500 non-null object dtypes: float64(2), int64(3), object(5) memory usage: 195.4+ KB
In [27]:
df.isna().sum()
Out[27]:
Car ID 0 Brand 0 Year 0 Engine Size 0 Fuel Type 0 Transmission 0 Mileage 0 Condition 0 Price 0 Model 0 dtype: int64
In [29]:
df.describe()
Out[29]:
| Car ID | Year | Engine Size | Mileage | Price | |
|---|---|---|---|---|---|
| count | 2500.00000 | 2500.0000 | 2500.000000 | 2500.000000 | 2500.000000 |
| mean | 1250.50000 | 2011.6268 | 3.465240 | 149749.844800 | 52638.022532 |
| std | 721.83216 | 6.9917 | 1.432053 | 87919.952034 | 27295.833455 |
| min | 1.00000 | 2000.0000 | 1.000000 | 15.000000 | 5011.270000 |
| 25% | 625.75000 | 2005.0000 | 2.200000 | 71831.500000 | 28908.485000 |
| 50% | 1250.50000 | 2012.0000 | 3.400000 | 149085.000000 | 53485.240000 |
| 75% | 1875.25000 | 2018.0000 | 4.700000 | 225990.500000 | 75838.532500 |
| max | 2500.00000 | 2023.0000 | 6.000000 | 299967.000000 | 99982.590000 |
In [31]:
df.describe().T.plot(kind='bar')
Out[31]:
<Axes: >
In [35]:
import plotly.express as px
import pandas as pd
# Assuming df is already defined
# اختيار الأعمدة الرقمية فقط
numeric_df = df.select_dtypes(include=[float, int])
# حساب مصفوفة معامل الارتباط
correlation_matrix = numeric_df.corr()
# رسم خريطة حرارية لمصفوفة معامل الارتباط باستخدام Plotly
fig = px.imshow(correlation_matrix, text_auto=True, aspect="auto",
title='Correlation Matrix',
color_continuous_scale='RdBu_r')
fig.show()
In [39]:
df.columns.tolist()
Out[39]:
['Car ID', 'Brand', 'Year', 'Engine Size', 'Fuel Type', 'Transmission', 'Mileage', 'Condition', 'Price', 'Model']
In [41]:
for col in df:
sns.histplot(x=col,data=df,kde=True)
plt.show()
In [43]:
for col in df:
if df[col].dtype == 'O':
sns.countplot(x=col,data=df)
plt.show()
In [45]:
import plotly.express as px
import pandas as pd
# Assuming df is already defined and contains the data
# Define the columns you want to plot
columns =['Car ID',
'Brand',
'Year',
'Engine Size',
'Fuel Type',
'Transmission',
'Mileage',
'Condition',
'Price',
'Model']
# Iterate through each column and create a bar chart or histogram
for column in columns:
try:
if column not in df.columns:
print(f"Column {column} does not exist in the DataFrame")
continue
# Check if the column is suitable for a bar chart (categorical data)
if df[column].dtype == 'object' or df[column].dtype.name == 'category':
# Count the occurrences of each unique value
column_counts = df[column].value_counts().reset_index()
column_counts.columns = [column, 'count']
# Create a bar chart using plotly.express
fig = px.bar(
column_counts,
x=column,
y='count',
title=f'Distribution of {column}',
labels={column: column, 'count': 'Count'},
text='count'
)
# Update layout for better readability
fig.update_layout(
xaxis_title=column,
yaxis_title='Count',
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)',
title_font=dict(size=18, family="Arial"),
xaxis={'categoryorder': 'total descending'}
)
# Show the figure
fig.show()
# For numerical data
elif df[column].dtype in ['int64', 'float64']:
# Create a histogram for numerical columns
fig = px.histogram(
df,
x=column,
title=f'Distribution of {column}',
labels={column: column, 'count': 'Count'}
)
# Update layout for better readability
fig.update_layout(
xaxis_title=column,
yaxis_title='Count',
paper_bgcolor='rgba(0,0,0,0)',
plot_bgcolor='rgba(0,0,0,0)',
title_font=dict(size=18, family="Arial")
)
# Show the figure
fig.show()
except Exception as e:
print(f"Could not create plot for column {column}: {e}")
In [47]:
df
Out[47]:
| Car ID | Brand | Year | Engine Size | Fuel Type | Transmission | Mileage | Condition | Price | Model | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Tesla | 2016 | 2.3 | Petrol | Manual | 114832 | New | 26613.92 | Model X |
| 1 | 2 | BMW | 2018 | 4.4 | Electric | Manual | 143190 | Used | 14679.61 | 5 Series |
| 2 | 3 | Audi | 2013 | 4.5 | Electric | Manual | 181601 | New | 44402.61 | A4 |
| 3 | 4 | Tesla | 2011 | 4.1 | Diesel | Automatic | 68682 | New | 86374.33 | Model Y |
| 4 | 5 | Ford | 2009 | 2.6 | Diesel | Manual | 223009 | Like New | 73577.10 | Mustang |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2495 | 2496 | Audi | 2020 | 2.4 | Petrol | Automatic | 22650 | Like New | 61384.10 | Q5 |
| 2496 | 2497 | Audi | 2001 | 5.7 | Hybrid | Manual | 77701 | Like New | 24710.35 | A3 |
| 2497 | 2498 | Ford | 2021 | 1.1 | Hybrid | Manual | 272827 | Like New | 29902.45 | Fiesta |
| 2498 | 2499 | Audi | 2002 | 4.5 | Diesel | Manual | 229164 | Like New | 46085.67 | Q5 |
| 2499 | 2500 | Toyota | 2005 | 4.6 | Diesel | Automatic | 80978 | Used | 16594.14 | RAV4 |
2500 rows × 10 columns
In [52]:
!pip install WordCloud
Requirement already satisfied: WordCloud in c:\users\lenovo\anaconda3\lib\site-packages (1.9.4) Requirement already satisfied: numpy>=1.6.1 in c:\users\lenovo\anaconda3\lib\site-packages (from WordCloud) (1.26.4) Requirement already satisfied: pillow in c:\users\lenovo\anaconda3\lib\site-packages (from WordCloud) (10.4.0) Requirement already satisfied: matplotlib in c:\users\lenovo\anaconda3\lib\site-packages (from WordCloud) (3.9.2) Requirement already satisfied: contourpy>=1.0.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (1.2.0) Requirement already satisfied: cycler>=0.10 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (0.11.0) Requirement already satisfied: fonttools>=4.22.0 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (4.51.0) Requirement already satisfied: kiwisolver>=1.3.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (1.4.4) Requirement already satisfied: packaging>=20.0 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (24.1) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (3.1.2) Requirement already satisfied: python-dateutil>=2.7 in c:\users\lenovo\anaconda3\lib\site-packages (from matplotlib->WordCloud) (2.9.0.post0) Requirement already satisfied: six>=1.5 in c:\users\lenovo\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib->WordCloud) (1.16.0)
In [53]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from collections import Counter
import pandas as pd
# Assuming `dataset` is your DataFrame and `stop_words_list` is a list of stopwords
# Example stopwords list, modify as needed
stop_words_list = set(STOPWORDS)
# Generate frequency counts from the 'business_category' column
counts = Counter(df["Brand"].dropna().apply(lambda x: str(x)))
# Generate the word cloud
wcc = WordCloud(
background_color="black",
width=1600, height=800,
max_words=2000,
stopwords=stop_words_list
)
wcc.generate_from_frequencies(counts)
# Display the word cloud
plt.figure(figsize=(10, 5), facecolor='k')
plt.imshow(wcc, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()
In [56]:
df.drop(columns = ["Car ID"],inplace = True )
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
le = LabelEncoder()
df["Brand"] = le.fit_transform(df["Brand"])
le = LabelEncoder()
df["Fuel Type"] = le.fit_transform(df["Fuel Type"])
le = LabelEncoder()
df["Transmission"] = le.fit_transform(df["Transmission"])
le = LabelEncoder()
df["Condition"] = le.fit_transform(df["Condition"])
le = LabelEncoder()
df["Model"] = le.fit_transform(df["Model"])
In [58]:
x = df.drop("Price", axis=1)
y = df["Price"]
scaler_df = StandardScaler()
x = pd.DataFrame(scaler_df. fit_transform(x),columns = x.columns)
plt.figure(figsize = (12,8))
sns.heatmap(data = df.corr(),annot = True, cmap='viridis')
plt.show()
In [60]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size = 0.2, random_state = 30)
from sklearn.linear_model import LinearRegression,Lasso, Ridge,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error , mean_absolute_error
dt =DecisionTreeRegressor()
dt.fit(x_train,y_train)
dt.score(x_test,y_test)*100, dt.score(x_train,y_train)*100
Out[60]:
(-103.67627129564703, 100.0)
In [62]:
mean_squared_error(y_test ,dt.predict(x_test)),mean_absolute_error(y_test ,dt.predict(x_test))
Out[62]:
(1563338739.6327448, 32135.337119999997)
In [64]:
x_test
Out[64]:
| Brand | Year | Engine Size | Fuel Type | Transmission | Mileage | Condition | Model | |
|---|---|---|---|---|---|---|---|---|
| 679 | 1.484284 | 1.340886 | -0.255097 | -0.426771 | -1.047528 | 0.151204 | 1.206712 | 1.078942 |
| 1062 | -1.485868 | -1.234110 | -0.744004 | -1.311453 | 0.954628 | -1.639795 | -1.225194 | 1.325715 |
| 2114 | 0.989259 | -1.663276 | -1.232910 | 0.457911 | -1.047528 | 0.905017 | -0.009241 | 0.462009 |
| 524 | -0.000792 | 0.625610 | 0.233809 | 1.342593 | 0.954628 | -0.498392 | 1.206712 | -1.142017 |
| 1636 | -1.485868 | 0.196444 | -0.255097 | -0.426771 | -1.047528 | 0.833574 | -1.225194 | 1.325715 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 552 | -1.485868 | -1.663276 | 0.652872 | -1.311453 | -1.047528 | -1.436865 | -0.009241 | -1.265404 |
| 490 | 0.494233 | 0.768665 | -1.442442 | 1.342593 | -1.047528 | 1.080041 | 1.206712 | 0.215236 |
| 1883 | 0.494233 | -0.947999 | -1.512286 | 1.342593 | -1.047528 | -0.198628 | -0.009241 | -0.401697 |
| 20 | 0.494233 | 1.054776 | 0.303653 | 1.342593 | -1.047528 | 0.487565 | 1.206712 | -1.018630 |
| 199 | -1.485868 | -1.663276 | 0.233809 | -1.311453 | -1.047528 | 1.448143 | 1.206712 | 1.202328 |
500 rows × 8 columns
In [66]:
dt.predict([[1.484284,1.340886,-0.255097,-0.426771,-1.047528,0.151204,1.206712,1.078942]])
C:\Users\LENOVO\anaconda3\Lib\site-packages\sklearn\base.py:493: UserWarning: X does not have valid feature names, but DecisionTreeRegressor was fitted with feature names
Out[66]:
array([89389.53])
In [68]:
y_test.head()
Out[68]:
679 70016.62 1062 94827.57 2114 21792.22 524 10986.59 1636 67863.46 Name: Price, dtype: float64
In [70]:
df.head()
Out[70]:
| Brand | Year | Engine Size | Fuel Type | Transmission | Mileage | Condition | Price | Model | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 5 | 2016 | 2.3 | 3 | 1 | 114832 | 1 | 26613.92 | 19 |
| 1 | 1 | 2018 | 4.4 | 1 | 1 | 143190 | 2 | 14679.61 | 1 |
| 2 | 0 | 2013 | 4.5 | 1 | 1 | 181601 | 1 | 44402.61 | 3 |
| 3 | 5 | 2011 | 4.1 | 0 | 0 | 68682 | 1 | 86374.33 | 20 |
| 4 | 2 | 2009 | 2.6 | 0 | 1 | 223009 | 0 | 73577.10 | 21 |
In [ ]: